import pymongo
from pymongo import MongoClient
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as pex
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import warnings
# Silence library warnings globally — pandas/statsmodels are very chatty
# during the repeated ARIMA fits below. Note this also hides warnings that
# may flag real problems (e.g. SettingWithCopyWarning).
warnings.filterwarnings("ignore")
import os

# SECURITY: this connection string previously embedded a live username and
# password directly in source control. Prefer the MONGODB_URL environment
# variable; the in-source value is kept only as a fallback for backward
# compatibility, and the exposed credential should be rotated.
mongodb_url = os.environ.get(
    "MONGODB_URL",
    "mongodb+srv://saiteja:L4KMI01na9a7h5WD@c1.c1bek3y.mongodb.net/test",
)
client = MongoClient(mongodb_url)
db = client.get_database('Microsoft_Stock')
db
Database(MongoClient(host=['ac-jr6oepz-shard-00-00.c1bek3y.mongodb.net:27017', 'ac-jr6oepz-shard-00-01.c1bek3y.mongodb.net:27017', 'ac-jr6oepz-shard-00-02.c1bek3y.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-1cfz2n-shard-0', tls=True), 'Microsoft_Stock')
# Handle to the 'Dataset' collection inside the Microsoft_Stock database
# (dict-style access is equivalent to get_collection with default options).
collection = db['Dataset']
collection
Collection(Database(MongoClient(host=['ac-jr6oepz-shard-00-00.c1bek3y.mongodb.net:27017', 'ac-jr6oepz-shard-00-01.c1bek3y.mongodb.net:27017', 'ac-jr6oepz-shard-00-02.c1bek3y.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-1cfz2n-shard-0', tls=True), 'Microsoft_Stock'), 'Dataset')
# Pull every document from the collection and keep only the OHLCV fields
# (plus the trading date), in a fixed column order.
columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
main_list = [[doc[col] for col in columns] for doc in collection.find()]
microsoft = pd.DataFrame(main_list, columns=columns)
microsoft
| Date | Open | High | Low | Close | Volume | |
|---|---|---|---|---|---|---|
| 0 | 4/1/2015 16:00 | 40.60 | 40.76 | 40.31 | 40.72 | 36865322 |
| 1 | 4/2/2015 16:00 | 40.66 | 40.74 | 40.12 | 40.29 | 37487476 |
| 2 | 4/6/2015 16:00 | 40.34 | 41.78 | 40.18 | 41.55 | 39223692 |
| 3 | 4/7/2015 16:00 | 41.61 | 41.91 | 41.31 | 41.53 | 28809375 |
| 4 | 4/8/2015 16:00 | 41.48 | 41.69 | 41.04 | 41.42 | 24753438 |
| ... | ... | ... | ... | ... | ... | ... |
| 1506 | 3/25/2021 16:00 | 235.30 | 236.94 | 231.57 | 232.34 | 34061853 |
| 1507 | 3/26/2021 16:00 | 231.55 | 236.71 | 231.55 | 236.48 | 25479853 |
| 1508 | 3/29/2021 16:00 | 236.59 | 236.80 | 231.88 | 235.24 | 25227455 |
| 1509 | 3/30/2021 16:00 | 233.53 | 233.85 | 231.10 | 231.85 | 24792012 |
| 1510 | 3/31/2021 16:00 | 232.91 | 239.10 | 232.39 | 235.77 | 43623471 |
1511 rows × 6 columns
As you can see below, the Date column currently has dtype `object`; we need to convert it to the datetime data type.
# Dtype overview: Date is still `object` here; it is converted to datetime below.
microsoft.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1511 entries, 0 to 1510 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 1511 non-null object 1 Open 1511 non-null float64 2 High 1511 non-null float64 3 Low 1511 non-null float64 4 Close 1511 non-null float64 5 Volume 1511 non-null int64 dtypes: float64(4), int64(1), object(1) memory usage: 71.0+ KB
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
microsoft.describe()
| Open | High | Low | Close | Volume | |
|---|---|---|---|---|---|
| count | 1511.000000 | 1511.000000 | 1511.000000 | 1511.000000 | 1.511000e+03 |
| mean | 107.385976 | 108.437472 | 106.294533 | 107.422091 | 3.019863e+07 |
| std | 56.691333 | 57.382276 | 55.977155 | 56.702299 | 1.425266e+07 |
| min | 40.340000 | 40.740000 | 39.720000 | 40.290000 | 1.016120e+05 |
| 25% | 57.860000 | 58.060000 | 57.420000 | 57.855000 | 2.136213e+07 |
| 50% | 93.990000 | 95.100000 | 92.920000 | 93.860000 | 2.662962e+07 |
| 75% | 139.440000 | 140.325000 | 137.825000 | 138.965000 | 3.431962e+07 |
| max | 245.030000 | 246.130000 | 242.920000 | 244.990000 | 1.352271e+08 |
# Parse Date and drop the 16:00 time-of-day component. The original code
# looped over all 1511 rows assigning strftime strings back through chained
# indexing (microsoft["Date"][i] = ...), which is O(n) pandas operations and
# triggers SettingWithCopyWarning; normalize() does the same thing in one
# vectorized step and keeps the column as datetime64 (midnight timestamps).
microsoft["Date"] = pd.to_datetime(microsoft["Date"]).dt.normalize()
microsoft
| Date | Open | High | Low | Close | Volume | |
|---|---|---|---|---|---|---|
| 0 | 2015-04-01 | 40.60 | 40.76 | 40.31 | 40.72 | 36865322 |
| 1 | 2015-04-02 | 40.66 | 40.74 | 40.12 | 40.29 | 37487476 |
| 2 | 2015-04-06 | 40.34 | 41.78 | 40.18 | 41.55 | 39223692 |
| 3 | 2015-04-07 | 41.61 | 41.91 | 41.31 | 41.53 | 28809375 |
| 4 | 2015-04-08 | 41.48 | 41.69 | 41.04 | 41.42 | 24753438 |
| ... | ... | ... | ... | ... | ... | ... |
| 1506 | 2021-03-25 | 235.30 | 236.94 | 231.57 | 232.34 | 34061853 |
| 1507 | 2021-03-26 | 231.55 | 236.71 | 231.55 | 236.48 | 25479853 |
| 1508 | 2021-03-29 | 236.59 | 236.80 | 231.88 | 235.24 | 25227455 |
| 1509 | 2021-03-30 | 233.53 | 233.85 | 231.10 | 231.85 | 24792012 |
| 1510 | 2021-03-31 | 232.91 | 239.10 | 232.39 | 235.77 | 43623471 |
1511 rows × 6 columns
# Null check: every column is fully populated (no imputation needed).
microsoft.isna().sum()
Date 0 Open 0 High 0 Low 0 Close 0 Volume 0 dtype: int64
As you can see below, there aren't any outliers in the dataset, so no outlier removal is needed.
# Box plots of the four price columns; Volume is excluded because its scale
# (tens of millions) would flatten the price boxes.
box_df = microsoft.drop('Volume', axis=1)
sns.boxplot(data=box_df)
<Axes: >
# Closing price over the full 2015-2021 date range.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Date vs Closing Price of the Stock')
ax.set_xlabel('Year')
ax.set_ylabel('Close Price')
x = microsoft['Date']
y = microsoft['Close']
ax.plot(x, y)
[<matplotlib.lines.Line2D at 0x7f4eac43f700>]
# Opening price over the full date range.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Date vs Opening Price of the Stock')
ax.set_xlabel('Year')
ax.set_ylabel('Open Price')
x = microsoft['Date']
y = microsoft['Open']
ax.plot(x, y, color='orange')
[<matplotlib.lines.Line2D at 0x7f4eac2a66e0>]
# Daily high price over the full date range.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Date vs Highest Price of the Stock')
ax.set_xlabel('Year')
ax.set_ylabel('High Price')
x = microsoft['Date']
y = microsoft['High']
ax.plot(x, y, color='red')
[<matplotlib.lines.Line2D at 0x7f4eac077a00>]
# Daily low price over the full date range.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Date vs Lowest Price of the Stock')
ax.set_xlabel('Year')
ax.set_ylabel('Low Price')
x = microsoft['Date']
y = microsoft['Low']
ax.plot(x, y, color='green')
[<matplotlib.lines.Line2D at 0x7f4eac0f9480>]
# Daily traded volume over the full date range.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Date vs Volume of the Stock Purchased')
ax.set_xlabel('Year')
ax.set_ylabel('Volume')
x = microsoft['Date']
y = microsoft['Volume']
ax.plot(x, y, color='purple')
[<matplotlib.lines.Line2D at 0x7f4e9e752650>]
# Work on a copy so the original frame stays untouched.
manipulate = microsoft.copy()
# Calendar month (1-12) of each trading day, for month-wise aggregation.
manipulate['Month'] = manipulate['Date'].dt.month
manipulate.head()
| Date | Open | High | Low | Close | Volume | Month | |
|---|---|---|---|---|---|---|---|
| 0 | 2015-04-01 | 40.60 | 40.76 | 40.31 | 40.72 | 36865322 | 4 |
| 1 | 2015-04-02 | 40.66 | 40.74 | 40.12 | 40.29 | 37487476 | 4 |
| 2 | 2015-04-06 | 40.34 | 41.78 | 40.18 | 41.55 | 39223692 | 4 |
| 3 | 2015-04-07 | 41.61 | 41.91 | 41.31 | 41.53 | 28809375 | 4 |
| 4 | 2015-04-08 | 41.48 | 41.69 | 41.04 | 41.42 | 24753438 | 4 |
# Month-wise averages of the numeric columns. Date must be excluded first:
# taking the mean of a datetime column raises on modern pandas (older
# versions handled it silently, which is what the original relied on).
yearly_stocks = manipulate.drop(columns='Date').groupby(by='Month').mean()
colors = ['#e8dff5', '#fce1e4', '#fcf4dd', '#ddedea']
# Plot only the four price columns — the same set the original selected
# positionally with iloc[:, 1:5], now named explicitly.
price_columns = ['Open', 'High', 'Low', 'Close']
fig = pex.bar(yearly_stocks, x=yearly_stocks.index, y=price_columns,
              title='Column-wise stock price distribution by Month',
              barmode='group', text_auto=True,
              color_discrete_sequence=colors)
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(showline=True, linewidth=2, linecolor='grey'),
    yaxis=dict(showline=True, linewidth=2, linecolor='grey'),
    margin=dict(l=50, r=50, b=50, t=80, pad=4),
    width=950,
    height=500
)
fig.show()
# Bin the daily rows into calendar years ('A' = annual, year-end labels)
# and keep each year's maximum for every column.
resample = microsoft.copy().set_index('Date')
max_val_resample = resample.resample(rule='A').max()
max_val_resample
| Open | High | Low | Close | Volume | |
|---|---|---|---|---|---|
| Date | |||||
| 2015-12-31 | 56.47 | 56.85 | 56.29 | 56.55 | 135227059 |
| 2016-12-31 | 63.84 | 64.10 | 63.41 | 63.62 | 133502985 |
| 2017-12-31 | 87.12 | 87.50 | 86.23 | 86.85 | 71066729 |
| 2018-12-31 | 115.42 | 116.18 | 114.93 | 115.61 | 111242070 |
| 2019-12-31 | 159.45 | 159.55 | 158.22 | 158.96 | 55636391 |
| 2020-12-31 | 229.27 | 232.86 | 227.35 | 231.65 | 97073557 |
| 2021-12-31 | 245.03 | 246.13 | 242.92 | 244.99 | 69870638 |
# Year-end maximum close prices from the resampled frame.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Year End Maximum Values by Resampling')
ax.set_xlabel('Year')
ax.set_ylabel('Close Price')
x = max_val_resample.index
y = max_val_resample['Close']
ax.plot(x, y)
[<matplotlib.lines.Line2D at 0x7f4e9cd3d990>]
# Bar chart of the yearly maximum traded volume.
fig, ax = plt.subplots(figsize=(12, 6))
sns.set_palette('muted')
ax.set_title('Yearwise Volumes of Stocks Purchased')
ax.set_xlabel('Year')
ax.set_ylabel('Volume')
x = max_val_resample.index
y = max_val_resample['Volume']
sns.barplot(x=x, y=y, ax=ax)
plt.tight_layout()
plt.show()
# Pairwise correlations of the numeric columns. numeric_only=True is needed
# on modern pandas, which raises when corr() hits the datetime Date column
# (older versions dropped it silently, which is what the original relied on).
correlations = microsoft.corr(numeric_only=True)
round(correlations, 2)
sns.heatmap(correlations)
<Axes: >
# Reduce to the univariate series we will model: Close price indexed by Date.
final_data = microsoft[['Date', 'Close']].set_index('Date')
final_data
| Close | |
|---|---|
| Date | |
| 2015-04-01 | 40.72 |
| 2015-04-02 | 40.29 |
| 2015-04-06 | 41.55 |
| 2015-04-07 | 41.53 |
| 2015-04-08 | 41.42 |
| ... | ... |
| 2021-03-25 | 232.34 |
| 2021-03-26 | 236.48 |
| 2021-03-29 | 235.24 |
| 2021-03-30 | 231.85 |
| 2021-03-31 | 235.77 |
1511 rows × 1 columns
Hypothesis test for whether the data is stationary (Augmented Dickey-Fuller test):
def adfuller_test(sales):
    """Run the Augmented Dickey-Fuller test on *sales* and print a verdict.

    Null hypothesis: the series has a unit root (is non-stationary).
    A p-value <= 0.05 rejects the null, i.e. the series is stationary.
    """
    res = adfuller(sales)
    labels = ['ADF Test Statistic', 'p-value', '#lags used', 'Number of observations used']
    # adfuller returns (statistic, pvalue, usedlag, nobs, ...); zip stops at
    # the shorter sequence, so only the first four entries are labeled.
    for value, label in zip(res, labels):
        print(label + ' : ' + str(value))
    if res[1] <= 0.05:
        print("Strong evidence against the null hypothesis, reject the null hypothesis.")
    else:
        print("Weak evidence against the null hypothesis, accept the null hypothesis.")

adfuller_test(final_data['Close'])
ADF Test Statistic : 1.7371362899270977 p-value : 0.9982158366942122 #lags used : 24 Number of observations used : 1486 Weak evidence against the null hypothesis, accept the null hypothesis.
As we can see in the result above, the null hypothesis was not rejected, which means the data at hand is not stationary; we therefore need to apply differencing.
# Decompose the close price into trend/seasonal/residual (30-day period).
decomposition = sm.tsa.seasonal_decompose(final_data['Close'], period=30)
decomposition.plot()
result = sm.tsa.stattools.adfuller(final_data['Close'])
plt.show()
final_data = pd.DataFrame(final_data['Close'])
# First difference of the close price. diff() is the vectorized equivalent
# of the original Close - Close.shift(1) construction; the first row (which
# has no predecessor) becomes 0, exactly as the original shift+bfill
# produced, and this avoids the deprecated fillna(method='bfill') keyword.
final_data['Difference'] = final_data['Close'].diff().fillna(0.0)
# Visual check: the differenced series should hover around zero.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Differenced Close Price')
ax.set_xlabel('Year')
ax.set_ylabel('Differenced Close Price')
x = final_data.index
y = final_data['Difference']
ax.plot(x, y)
[<matplotlib.lines.Line2D at 0x7f4e83fb0b80>]
# Re-run the ADF test on the differenced series to confirm stationarity.
adfuller_test(final_data['Difference'])
ADF Test Statistic : -10.043852364231784 p-value : 1.472030729039202e-17 #lags used : 24 Number of observations used : 1486 Strong evidence against the null hypothesis, reject the null hypothesis.
As we can see in the result above, the null hypothesis was rejected, so we can now use this data to plot the ACF and PACF and find the p and q values for the ARIMA model.
# ACF suggests the MA order q; PACF suggests the AR order p for ARIMA(p, d, q).
# plot_acf/plot_pacf are the functions already imported from
# statsmodels.graphics.tsaplots — identical to the sm.graphics.tsa aliases.
plot_acf(final_data['Difference'])
plt.title("ACF Plot", size=15)
plt.show()
plot_pacf(final_data['Difference'])
plt.title("PACF Plot", size=15)
plt.show()
# The differenced column was only needed for the stationarity check and the
# ACF/PACF plots; the models below are fit on the raw close series.
final_data = final_data.drop(columns='Difference')
final_data
| Close | |
|---|---|
| Date | |
| 2015-04-01 | 40.72 |
| 2015-04-02 | 40.29 |
| 2015-04-06 | 41.55 |
| 2015-04-07 | 41.53 |
| 2015-04-08 | 41.42 |
| ... | ... |
| 2021-03-25 | 232.34 |
| 2021-03-26 | 236.48 |
| 2021-03-29 | 235.24 |
| 2021-03-30 | 231.85 |
| 2021-03-31 | 235.77 |
1511 rows × 1 columns
# Chronological split: train on 2015-2019, hold out 2020-01-02 onward.
# .loc with string bounds slices the DatetimeIndex inclusively, exactly like
# the bare [] slice it replaces.
train = final_data.loc['2015-04-01':'2019-12-31']
test = final_data.loc['2020-01-02':]
train
| Close | |
|---|---|
| Date | |
| 2015-04-01 | 40.72 |
| 2015-04-02 | 40.29 |
| 2015-04-06 | 41.55 |
| 2015-04-07 | 41.53 |
| 2015-04-08 | 41.42 |
| ... | ... |
| 2019-12-24 | 157.38 |
| 2019-12-26 | 158.67 |
| 2019-12-27 | 158.96 |
| 2019-12-30 | 157.59 |
| 2019-12-31 | 157.70 |
1197 rows × 1 columns
# Hold-out set: 314 trading days from 2020-01-02 through 2021-03-31.
test
| Close | |
|---|---|
| Date | |
| 2020-01-02 | 160.62 |
| 2020-01-03 | 158.62 |
| 2020-01-06 | 159.03 |
| 2020-01-07 | 157.58 |
| 2020-01-08 | 160.09 |
| ... | ... |
| 2021-03-25 | 232.34 |
| 2021-03-26 | 236.48 |
| 2021-03-29 | 235.24 |
| 2021-03-30 | 231.85 |
| 2021-03-31 | 235.77 |
314 rows × 1 columns
# Visualize the chronological split.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Train Test Graph')
ax.set_xlabel('Dates')
ax.set_ylabel('Prices')
ax.plot(train, 'lightblue', label='Training Data')
ax.plot(test, 'lightpink', label='Testing Data')
ax.legend()
<matplotlib.legend.Legend at 0x7f4e83d39c60>
# Walk-forward validation: for each test day, refit ARIMA(2,1,0) on all data
# seen so far, forecast one step ahead, then append the actual observation.
train_array = train['Close'].values
test_array = test['Close'].values
predict = []
hist = list(train_array)
for n in range(len(test_array)):
    fit_model = sm.tsa.arima.ARIMA(hist, order=(2, 1, 0)).fit()
    # forecast() with no horizon returns a one-element array: the next step.
    yhat = fit_model.forecast()[0]
    predict.append(yhat)
    # Reveal the true value so the next refit can use it.
    hist.append(test_array[n])
mse = mean_squared_error(test_array, predict)
print('Mean Squared Error for the Predicted Data using ARIMA: %.3f' % mse)
Mean Squared Error for the Predicted Data using ARIMA: 19.676
# Predicted vs actual prices over the test window (ARIMA walk-forward).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Microsoft Stock Price Prediction using ARIMA')
ax.set_xlabel('Dates')
ax.set_ylabel('Prices')
ax.plot(train, color='lightblue', label='Training Data')
ax.plot(test.index, predict, color='lightgreen', label='Predicted Price',
        marker='o', linestyle='dashed')
ax.plot(test.index, test['Close'], color='lightpink', label='Actual Price')
ax.legend()
<matplotlib.legend.Legend at 0x7f4e83db7700>
# Seasonal ARIMA on the full close series: non-seasonal order (p=3, d=0, q=2)
# with a 12-period seasonal component (P=2, D=1, Q=0). Fit on all 1511 rows.
model = sm.tsa.statespace.SARIMAX(final_data, order = (3, 0, 2), seasonal_order = (2, 1, 0, 12))
best_model = model.fit()
print(best_model.summary())
SARIMAX Results
===========================================================================================
Dep. Variable: Close No. Observations: 1511
Model: SARIMAX(3, 0, 2)x(2, 1, [], 12) Log Likelihood -3553.252
Date: Sun, 30 Apr 2023 AIC 7122.505
Time: 00:45:11 BIC 7165.005
Sample: 0 HQIC 7138.338
- 1511
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 1.0589 0.107 9.852 0.000 0.848 1.270
ar.L2 -0.2946 0.082 -3.585 0.000 -0.456 -0.134
ar.L3 0.1741 0.068 2.575 0.010 0.042 0.307
ma.L1 -0.3393 0.107 -3.165 0.002 -0.549 -0.129
ma.L2 0.3395 0.058 5.831 0.000 0.225 0.454
ar.S.L12 -0.6264 0.013 -46.871 0.000 -0.653 -0.600
ar.S.L24 -0.3804 0.014 -27.298 0.000 -0.408 -0.353
sigma2 6.6700 0.115 57.992 0.000 6.445 6.895
===================================================================================
Ljung-Box (L1) (Q): 0.10 Jarque-Bera (JB): 4325.30
Prob(Q): 0.75 Prob(JB): 0.00
Heteroskedasticity (H): 21.14 Skew: -0.05
Prob(H) (two-sided): 0.00 Kurtosis: 11.32
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
# Predict the SARIMAX model over the test window (index labels, since the
# model was fit on the DatetimeIndex-ed frame).
pred_start = test.index[0]
pred_end = test.index[-1]
# One predict() call is enough: the original first computed a dynamic=True
# forecast and immediately discarded it, then overwrote the column through
# chained assignment on a slice (SettingWithCopyWarning). assign() returns
# a fresh frame carrying the new 'forecast' column instead.
test = test.assign(forecast=best_model.predict(start=pred_start, end=pred_end))
forecast = test['forecast'].values
mse = mean_squared_error(test_array, forecast)
print('Mean Squared Error for the Predicted Data using SARIMAX: %.3f' % mse)
Mean Squared Error for the Predicted Data using SARIMAX: 23.927
# Predicted vs actual prices over the test window (SARIMAX).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Microsoft Stock Price Prediction using SARIMAX')
ax.set_xlabel('Dates')
ax.set_ylabel('Prices')
ax.plot(train, color='lightblue', label='Training Data')
ax.plot(test.index, forecast, color='lightgreen', label='Predicted Price',
        marker='o', linestyle='dashed')
ax.plot(test.index, test['Close'], color='lightpink', label='Actual Price')
ax.legend()
<matplotlib.legend.Legend at 0x7f4e80563520>